Run the following commands sequentially:
module load apps/binapps/jupyter-notebook/6.0.0
module load apps/gcc/R/3.6.0
## also run the following if you have connection problems
module load tools/env/proxy2
Run the Jupyter notebook server as a cluster job with the number of cores and memory appropriate to the analysis; examples are:
jupyter-notebook-csf # runs a serial (1-core) job - on a 4-6GB/core
jupyter-notebook-csf -p 2 -m 16 # runs a parallel 2-core job - on a 16GB RAM/core
# Recommended for this example dataset
Your job 657643 ("jnotebook") has been submitted Checking if job has already started…
Run qstat to make sure it is running on the cluster. Or type:
jupyter-notebook-csf -c 657643
You should see (as example):
Once the notebook job is running you can now set up the SSH tunnel on your local computer (not the CSF):
cat jnotebook.o657643
Now to enable access from your local computer, run the following in your local machine [as shown on
cat jnotebook.o657643]
ssh -L 8888:hnode120:8888 [your-uni-id]@csf3.itservices.manchester.ac.uk
DO NOT EXIT until you have finished your tunnelled session.
On your local browser (browser in your local computer NOT CSF) use the following URL:
http://localhost:8888
If you are asked for a token to login to the notebook, have a look in this job's .e file for the token by running the following command:
jupyter-notebook-csf -t 657643
(you should only run the above command after the batch job has started)
After finishing, you must run the following command on the CSF3 to terminate the Jupyter batch job:
qdel 657643
This notebook documents the steps for the aggregated data with no normalisation by Cellranger
Researcher: Louisa Nelson
PI: Stephen Taylor
Analyst: Syed Murtuza Baker
Living biobanks are powerful resources, with the transformative aspect coming from the ability to perform detailed phenotypic studies on well-characterised models that accurately reflect a patient’s tumour, and in turn, the ability to correlate ex vivo observations with clinical chemotherapy responses. As such, living biobanks can potentially address limitations associated with established cancer cell lines, and indeed, our analysis shows that thus far, we have grossly underestimated the mitotic dysfunction in advanced human tumours.
These samples are basically an addition to the one sample in the paper that was analysed previously. They are cells from patients with ovarian cancer, we isolated the tumour and stromal cells for each patient. And have mixed them back together to give 1 sample per patient.
The aims were
From the first sample p53 and XIST were differentially expressed so you could use these to validate the initial clustering. The analysis after that would be the same as the previous sample
The tumour and stromal cells were mixed back at a 75%:25% ratio. XIST and TP53 are candidate markers for validating the initial clustering. Cellranger web summary: http://bartzabel.ls.manchester.ac.uk/syed/2o332gW33C/web_summary.html
#!/bin/bash --login
# SGE submission script: Cell Ranger "count" for a single sample (LN1_AS38).
# Re-run with the matching --id/--sample/--fastqs for each of the four samples.
#$ -cwd # Job will run from the current directory
#$ -pe smp.pe 12 # Number of cores (can be 2--32)
# --jobmode=sge lets cellranger submit its own pipeline stages as cluster jobs;
# --mempercore=4 should match the scheduler's per-core memory allowance.
cellranger count --id LN1_AS38 --fastqs /mnt/fls01-bcf01/ngsdata/Analysis/2019/nextseq/190709_NB500968_0129_AHVMT2BGXB_analysis/LouisaNelson/fastqs --sample LN1_AS38 --transcriptome /mnt/data-sets/bcf/softwareData/10xGenomics-CellRanger/3.0.0/refdata-cellranger-GRCh38-3.0.0 --jobmode=sge --mempercore=4
Run the following command to aggregate all samples
#!/bin/bash --login
# SGE submission script: Cell Ranger "aggr" combining the four per-sample runs.
# Requires aggr.csv (library_id -> molecule_info.h5 paths) in the working dir.
#$ -cwd # Job will run from the current directory
#$ -pe smp.pe 12 # Number of cores (can be 2--32)
# --normalize=none: no read-depth normalisation across samples (this notebook
# analyses the aggregated data without Cellranger depth normalisation).
cellranger aggr --id=KY_aggr_noDepthNormalization --csv=aggr.csv --jobmode=sge --mempercore=5 --normalize=none
You need to create the aggr.csv file first as
| library_id | molecule_h5 |
|---|---|
| LN1_AS38 | /mnt/mr01-home01/mqbsxsm2/scratch/Louisa_Nelson_10XGenomics_scRNA-seq/LN1_AS38/outs/molecule_info.h5 |
| LN2_AS59_3 | /mnt/mr01-home01/mqbsxsm2/scratch/Louisa_Nelson_10XGenomics_scRNA-seq/LN2_AS59_3/outs/molecule_info.h5 |
| LN3_AS74 | /mnt/mr01-home01/mqbsxsm2/scratch/Louisa_Nelson_10XGenomics_scRNA-seq/LN3_AS74/outs/molecule_info.h5 |
| LN4_AS79 | /mnt/mr01-home01/mqbsxsm2/scratch/Louisa_Nelson_10XGenomics_scRNA-seq/LN4_AS79/outs/molecule_info.h5 |
## Loading the libraries
## edits required to paths of R libraries
# Attach every package used by this workflow, record the session for
# reproducibility, then restore any previously saved workspace.
library(Seurat)            # Read10X() for the Cellranger matrix
library(scater)            # SingleCellExperiment QC metrics and plots
library(ggplot2)
library(scran)             # normalisation helpers, cyclone() cell-cycle classifier
library(GGally)
library(mclust)
library(Rtsne)
library(viridis)
library(umap)
library(tidyverse)
library(paletteer)
library(biomaRt)           # Ensembl queries (chromosome annotation for mito genes)
library(org.Hs.eg.db)#human
# library("org.Mm.eg.db",lib.loc="/mnt/fls01-home01/mqbsslaz/R/x86_64-pc-linux-gnu-library/3.6/")#mouse
# NOTE(review): the original loaded scater a second time here; the duplicate
# no-op call has been removed (scater is already attached above).
library(dynamicTreeCut)
library(ComplexHeatmap)
library(factoextra)
library(cluster)
library(edgeR)
# Record exact package versions used for this analysis run.
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
sessionInfo()
# Restore the saved workspace from the working directory.
# NOTE(review): load() errors if .RData does not exist — presumably a prior
# session is expected here; confirm before running fresh.
load(".RData")
#cbPalette <- paletteer_d(package = "ggthemes", palette="calc", n=12)
# Fixed 30-colour palette used throughout the notebook; indices are stable so
# that sample and cluster colours stay consistent across figures.
c30 <- c("dodgerblue2",#1
"#E31A1C", #2 red
"green4", #3
"#FF7F00", #4 orange
"green1",#5
"purple",#6
"blue1",#7
"deeppink1",#8
"darkorange4",#9
"black",#10
"gold1",#11
"darkturquoise",#12
"#6A3D9A", #13 purple
"orchid1",#14
"gray70",#15
"maroon",#16
"palegreen2",#17
"#333333",#18
"#CAB2D6", #19 lt purple
"#FDBF6F", #20 lt orange
"khaki2",#21
"skyblue2",#22
"steelblue4",#23
"green1",#24
"yellow4",#25
"yellow3",#26
"#FB9A99", #27 lt pink
"brown",#28
"#000099",#29
"#CC3300"#30
)
# Visual check of the palette.
pie(rep(1,30), col=c30)
# Choosing colours for samples (one per patient; 4 samples)
c_sample_col <- c30[c(3,22,19,30)]
# Choosing colours for clusters (up to 15 clusters)
c_clust_col <- c30[c(1,2,3,4,5,6,7,8,11,14,15,16,17,23,24)]
__edits required before next step__
## Importing cellranger data for this dataset. EDITS REQUIRED
# Path to the aggregated Cellranger filtered matrix directory
# (barcodes.tsv, features.tsv, matrix.mtx).
cellranger_pipestance_path <- "filtered_feature_bc_matrix_workshop/"
projectData <- Read10X(cellranger_pipestance_path)
#Checks: columns are cells (genes are rows)
projectData[1:5,1:5]
#Checks: size of data, genes and cell
dim(projectData)
## Sample information. EDITS REQUIRED
## One sample
#Sample1_barcodes <- data.frame(barcode=colnames(project_data), Sample="Eva_S1")
## Two (or more) samples
# After `cellranger aggr`, every cell barcode carries a numeric GEM-well
# suffix ("-1", "-2", ...) identifying the originating sample. Match that
# suffix explicitly, anchored at the end of the barcode: the original
# grep("1", ...) matched a digit ANYWHERE in the name, which silently
# mis-assigns cells if a digit ever appears elsewhere in a barcode.
Sample1_barcodes <- data.frame(barcode=colnames(projectData)[grepl("-1$", colnames(projectData))], Sample="Patient1")
Sample2_barcodes <- data.frame(barcode=colnames(projectData)[grepl("-2$", colnames(projectData))], Sample="Patient2")
Sample3_barcodes <- data.frame(barcode=colnames(projectData)[grepl("-3$", colnames(projectData))], Sample="Patient3")
Sample4_barcodes <- data.frame(barcode=colnames(projectData)[grepl("-4$", colnames(projectData))], Sample="Patient4")
# Per-sample cell counts; their sum should equal ncol(projectData).
print(paste0('Number of Sample1 cells: ',nrow(Sample1_barcodes)))
print(paste0('Number of Sample2 cells: ',nrow(Sample2_barcodes)))
print(paste0('Number of Sample3 cells: ',nrow(Sample3_barcodes)))
print(paste0('Number of Sample4 cells: ',nrow(Sample4_barcodes)))
#Barcodes
#For single sample
#annotBarcode <- Sample1_barcodes
#rownames(annotBarcode) <- annotBarcode$barcode
#head(annotBarcode)
## For multiple samples
# One row per cell, keyed by barcode, carrying the sample label.
annotBarcode <- rbind(Sample1_barcodes, Sample2_barcodes, Sample3_barcodes, Sample4_barcodes)
rownames(annotBarcode) <- annotBarcode$barcode
head(annotBarcode)
Analysis note Creating the SingleCellExperiment object using scater.
#make scater object: wrap the raw count matrix in a SingleCellExperiment
cdSc <- SingleCellExperiment(assays = list(counts = as.matrix(projectData)))#this object from library scater
#as.matrix allows it to load because problems sparse matrix and dense matrix. Actually starts sparse, then dense, then sparse!
# Attach per-cell metadata.
# NOTE(review): this assumes the rows of annotBarcode (rbind of samples 1..4)
# are in the same order as colnames(projectData) — TODO confirm; matching by
# barcode would be safer.
colData(cdSc)$barcode <- annotBarcode$barcode #colData for cells
colData(cdSc)$Sample <- annotBarcode$Sample
rowData(cdSc)$symbol <- rownames(projectData) #rowData for genes
# Populate the standard per-cell / per-gene QC columns (total_counts, etc.).
cdSc <- scater::calculateQCMetrics(cdSc)
cdSc
We will use colData() function to access the metadata related with cells
#Checks: should be about 12 QC columns added by calculateQCMetrics
names(colData(cdSc))
We use rowData() function to access the metadata related with genes
#Checks: dropouts are statistical negatives, could be true negatives or false negatives (not detected properly)
names(rowData(cdSc))
#this normalises prior to our own filtering. So gets redone, but sometimes requested.
# Store log2(CPM + 1) as the exprs assay.
# NOTE(review): use_size_factors was removed in later scater releases; this
# call matches the scater version for R 3.6 loaded above — confirm on upgrade.
exprs(cdSc) <- log2(scater::calculateCPM(cdSc, use_size_factors = TRUE) + 1)
Normalisation notes Syed note about an older version of scater which automatically normalised the counts in its "log10counts". Not doing that at the moment. Here we use log2-counts-per-million with an offset of 1 as the exprs values. We have to note that the CPM for scater is different than the regular CPM calculation as we are considering the step-size here.
The value of the log-CPMs is explained by adding a prior count to avoid undefined values after the log-transformation, multiplying by a million, and dividing by the mean library size. These size factors are used to define the effective library sizes. This is done by scaling all size factors such that the mean scaled size factor is equal to the mean sum of counts across all features. The effective library sizes are then used in the denominator of the CPM calculation. The way that scater calculates the log-normalized counts is
# Exposition only: how scater's log-normalised counts are computed.
# ('example_sceset' is a placeholder object, not defined in this workflow.)
lib.sizes <- colSums(counts(example_sceset))
lib.sizes <- lib.sizes/mean(lib.sizes)
log2(counts(example_sceset)[1,]/lib.sizes+1)
Below are different plots to visualize the summary statistics.
Low-quality cells need to be identified and removed to ensure that the technical effects do not distort downstream analysis results. Three common measures of cell quality are:
The library size is defined as the total sum of counts across all genes. Cells with relatively small library sizes are considered to be of low quality as the RNA has not been efficiently captured (i.e., converted into cDNA and amplified) during library preparation.
The number of expressed genes in each cell is defined as the number of genes with non-zero counts for that cell. Any cell with very few expressed genes is likely to be of poor quality as the diverse transcript population has not been successfully captured.
High proportions of reads mapping to mitochondrial genes are indicative of poor-quality cells (Ilicic et al., 2016; Islam et al., 2014), possibly because of increased apoptosis and/or loss of cytoplasmic RNA from lysed cells.
# Human
# Strip any version suffix from the gene identifiers before querying Ensembl.
# NOTE(review): the original also applied tolower(), but HGNC symbols are
# upper-case, so lower-casing them makes the hgnc_symbol filter in getBM()
# return no matches — removed.
my.ids <- gsub('\\..*','',rownames(cdSc))
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", mirror = "useast")
#ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", host = "useast.ensembl.org")
chrName <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','chromosome_name'), filters = 'hgnc_symbol', values =my.ids, mart = ensembl)
# Map the biomaRt result back onto the rows of cdSc. getBM() returns one row
# per annotation hit, in its own order and length, so a logical vector over
# chrName rows must NOT be passed directly as a feature control for cdSc
# (the original did exactly that, misaligning the mitochondrial flags).
mito.symbols <- chrName$hgnc_symbol[!is.na(chrName$chromosome_name) & chrName$chromosome_name == "MT"]
is.mito <- rowData(cdSc)$symbol %in% mito.symbols
sum(is.mito)
## Mouse
#is.mito <- grepl("^mt-", rowData(cdSc)$symbol)
#cdSc <- calculateQCMetrics(cdSc, feature_controls=list(Mt=is.mito))
#print(paste0('Number of annotated mitochondrial genes = ',as.character(sum(is.mito))))
# Recompute QC metrics with the mitochondrial genes flagged as a control set
# (adds pct_counts_Mt etc. to colData).
cdSc <- calculateQCMetrics(cdSc, feature_controls=list(Mt=is.mito))
names(colData(cdSc))
## Plotting the histograms of QC entries.
# Three per-cell QC distributions: library size, expressed genes, mito %.
par(mfrow=c(2,2))
hist((cdSc$total_counts)/1e6, xlab="Library sizes (millions)", main="Histogram of Library size",
breaks=100, col="grey80", ylab="Number of cells")
hist(cdSc$total_features_by_counts, xlab="Number of expressed genes", main="Histogram of No. of Features",
breaks=100, col="grey80", ylab="Number of cells")
# Title typo fixed ("Mictochondria" -> "Mitochondria").
hist(cdSc$pct_counts_Mt, xlab="Mitochondrial proportion (%)",
ylab="Number of cells", breaks=100, main="Histogram of Mitochondria %", col="grey80")
summary(cdSc$pct_counts_Mt)#keep an eye on Max, i.e. cells with high mito
MITOCHONDRIAL GENES QC NOTES
MITOCHONDRIAL GENES QC NOTES FOR THIS DATASET __edits required__
# Pairwise scatter plots of the three per-cell QC metrics, side by side.
par(mfrow = c(1, 3))
n_genes  <- cdSc$total_features_by_counts
lib_mill <- cdSc$total_counts / 1e6
mito_pct <- cdSc$pct_counts_Mt
plot(n_genes, lib_mill, xlab = "Number of expressed genes",
     ylab = "Library size (millions)")
plot(n_genes, mito_pct, xlab = "Number of expressed genes",
     ylab = "Mitochondrial proportion (%)")
plot(lib_mill, mito_pct, xlab = "Total counts (in millions)",
     ylab = "Mitochondrial proportion (%)")
## How the counts are distributed across the genes
plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "is_feature_control_Mt")
This plot shows log10 total counts per gene (y-axis) versus number of cells expressing that gene (x-axis). Generally, in single cell datasets there are some genes with very low, or very high, total counts, which accounts for the S shape of the plot.
Mitochondrial genes (marked in orange) often appear at the top right corner of the figure since they are highly expressed in most cells. It is only a problem if these genes make up the majority of a cell's reads (more than 80%). These problematic cells are removed further down this workflow.
plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "pct_dropout_by_counts")
Dropouts is a term for genes with no read counts. As expected, cells with low number of reads have higher percentage of gene dropouts.
## How the counts are distributed across the cells
plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", col = "pct_counts_Mt")
Cells on the linear pattern show a good correlation between total counts and the number of gene features with counts. Problematic cells deviate from the linear trend.
Cells with a very high proportion of mitochondrial reads are problematic (e.g. in apoptosis) (Ilicic et al., 2016; Islam et al., 2014). They will be removed by the QC filtering.
plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", col = "Sample")
plotColData(cdSc, x = "log10_total_features_by_counts", y="log10_total_counts", size_by ="total_counts_Mt", colour_by = "pct_counts_Mt")
Picking thresholds for filtering out poor cells is not straightforward for different metrics as their absolute values depend on the protocol and biological system. For example, sequencing to greater depth will lead to more reads, regardless of the quality of the cells. To obtain an adaptive threshold, the assumption made here is that most of the dataset consists of high-quality cells. Plots to facilitate picking thresholds for cell cutoffs are below.
# Helper: per-sample density plot of one QC metric with its cutoff line.
# Deduplicates the three near-identical ggplot chains used below.
qc_density_plot <- function(df, cutoff, xlab, title) {
  ggplot(df, aes(x = x, fill = as.factor(Sample))) +
    geom_density(alpha = 0.5) +
    geom_vline(xintercept = cutoff, colour = "grey", linetype = "longdash") +
    labs(x = xlab, title = title, fill = "Sample") +
    theme_classic(base_size = 14) +
    scale_fill_manual(values = c_sample_col)
}

# Library size: cutoff is 3 MADs BELOW the median log10 library size.
cut_off_reads <- median(cdSc$log10_total_counts) - 3*mad(cdSc$log10_total_counts)
df <- data.frame(x = cdSc$log10_total_counts, Sample = cdSc$Sample)
plot_reads <- qc_density_plot(df, cut_off_reads,
                              expression('log'[10]*'(Library Size)'),
                              "Total reads density")

# Expressed genes: cutoff is 3 MADs BELOW the median log10 feature count.
cut_off_mRNA <- median(cdSc$log10_total_features_by_counts) - 3*mad(cdSc$log10_total_features_by_counts)
df <- data.frame(x = cdSc$log10_total_features_by_counts, Sample = cdSc$Sample)
plot_mRNA <- qc_density_plot(df, cut_off_mRNA,
                             expression('log'[10]*'(Number of expressed genes)'),
                             "Total genes expressed")

# Mitochondrial %: cutoff is 3 MADs ABOVE the median (high mito = poor cells).
cut_off_MT <- median(cdSc$pct_counts_Mt) + 3*mad(cdSc$pct_counts_Mt)
df <- data.frame(x = cdSc$pct_counts_Mt, Sample = cdSc$Sample)
plot_MT <- qc_density_plot(df, cut_off_MT,
                           expression('Pct of MT Expression'),
                           "Mitochondrial Expression")

multiplot(plot_reads, plot_MT, plot_mRNA, cols=2)
# Same library-size density as above, but faceted one panel per sample
# so per-patient distributions can be compared against the shared cutoff.
df <- data.frame(x = cdSc$log10_total_counts)
as_tibble(df) %>%
  dplyr::mutate(Sample = cdSc$Sample) %>%
  ggplot(aes(x = x, fill = as.factor(Sample))) +
  geom_density(alpha = 0.5) +
  geom_vline(xintercept = cut_off_reads, colour = "grey",
             linetype = "longdash") +
  labs(x = expression('log'[10]*'(Library Size)'),
       title = "Total reads density",
       fill = "Sample") +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c_sample_col) +
  facet_wrap(~ Sample, nrow = 2, ncol = 2)
The distribution of the reads, number of expressed genes and the percent of MT expression.
The dotted line represents the threshold which is the 3 Median Absolute Deviation (MAD). For the top two figures, cells on the left of this threshold would be filtered out and for the bottom figure, cells on the right of this figure would be filtered out.
Cells are removed with log-library sizes that are more than 3 median absolute deviations (MADs) below the median log-library size. (A log-transformation improves resolution at small values, especially when the MAD of the raw values is comparable to or greater than the median).
Similarly cells are removed where the log-transformed number of expressed genes is 3 MADs below the median.
Finally, cells are removed whose percentage of MT expression is more than 3 MADs above the median.
# Current dimensions (genes x cells) before any cell filtering.
dim(cdSc)
# Report the adaptive cutoffs (back-transformed from log10 where needed).
print(paste0('Cells removed if: '))
print(paste0('Read count below Libsize Cutoff: ',10^cut_off_reads))
print(paste0('Number of genes expressed below Feature Cutoff: ',10^cut_off_mRNA))
print(paste0('MT percent above Mito Cutoff: ',cut_off_MT))
# Flag outlier cells with scater's isOutlier (3 MADs, same definition as the
# manual cutoffs printed above): low library size, low feature count on the
# log scale; high mitochondrial percentage on the raw scale.
libsize.drop <- isOutlier(cdSc$total_counts, nmads=3, type="lower", log=TRUE)
feature.drop <- isOutlier(cdSc$total_features_by_counts, nmads=3, type="lower", log=TRUE)
mito.drop <- isOutlier(cdSc$pct_counts_Mt, nmads=3, type="higher")
print(paste0('Number of cells removed:'))
# Note: a cell can be flagged by more than one criterion, so these columns
# may overlap.
data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop),
ByMito=sum(mito.drop))
#Generate the `scater` object with this filtered profile: keep only cells not
#flagged by any of the three outlier criteria.
cdScFilt <- cdSc[,!(libsize.drop | feature.drop | mito.drop)]
# Recompute QC metrics on the filtered object so downstream plots/filters use
# up-to-date per-cell statistics.
cdScFilt <- calculateQCMetrics(cdScFilt)
print(paste0("Total cells before quality filtering = ",dim(cdSc)[2]))
print(paste0('Total cells before quality filtering, per sample:'))
table(colData(cdSc)$Sample)
print(paste0("Total cells remaining after quality filtering = ",dim(cdScFilt)[2]))
print(paste0('Total cells after quality filtering, per sample:'))
table(colData(cdScFilt)$Sample)
QC plots below to check result and decide whether further filtering is required.
## Before filtering cells
# NOTE(review): plotColData points are coloured via colour_by; it is not clear
# that scale_fill_manual affects them here — confirm the sample palette is
# actually applied.
plotColData(cdSc, x="log10_total_counts",y="log10_total_features_by_counts", colour_by = "Sample") +
scale_fill_manual(values=c_sample_col)
# After filtering cells
plotColData(cdScFilt, x="log10_total_counts",y="log10_total_features_by_counts", colour_by = "Sample") +
scale_fill_manual(values=c_sample_col)
#before filtering cells: same scatter coloured by mitochondrial percentage
plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")
#after filtering cells
plotColData(cdScFilt, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")
Violin plots below to investigate possible outlier cells with high read-depth (could be doublets).
# Per-cell QC metrics for the violin plots below.
df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)

# Helper: jittered violin of one QC metric per sample. Deduplicates the
# three identical ggplot chains of the original.
qc_violin <- function(df, metric) {
  ggplot(df, aes(factor(CellType), .data[[metric]], colour = Sample)) +
    geom_violin() +
    geom_jitter(height = 0, width = 0.1) +
    theme_classic(base_size = 14) +
    scale_colour_manual(values = c_sample_col) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

p1 <- qc_violin(df, "totalCount")
p2 <- qc_violin(df, "PctTotalCountMt")
p3 <- qc_violin(df, "totalFeatures")
multiplot(p1,p2,p3,cols = 2)
A few outlier cells. Filtering is needed.
print(paste0("Total cells before filtering high read-depth cells = ",dim(cdScFilt)[2]))
print(paste0('Total cells before filtering for each sample:'))
table(colData(cdScFilt)$Sample)
#cdScFilt
__Manual cutoff selection required in the next step. Careful this changes cdsFilt permanently. make a temp file if unsure__
#careful this changes cdsFilt permanently make a temp file if unsure
# Manual hard cutoff: drop suspected doublets / very high-depth cells.
# The 20000-count threshold was chosen by eye from the violin plots above and
# must be re-assessed for each new dataset.
cdScFilt <- cdScFilt[,!(cdScFilt$total_counts > 20000)]
cdScFilt
print(paste0("Total cells remaining after filtering high read-depth cells = ",dim(cdScFilt)[2]))
print(paste0('Total cells remaining after filtering for each sample:'))
table(colData(cdScFilt)$Sample)
#plots after filtering
# Same per-sample QC violins as before, regenerated on the post-cutoff object
# to confirm the high read-depth outliers are gone.
df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)

# Helper: jittered violin of one QC metric per sample (deduplicates the
# three identical ggplot chains of the original).
qc_violin_filtered <- function(df, metric) {
  ggplot(df, aes(factor(CellType), .data[[metric]], colour = Sample)) +
    geom_violin() +
    geom_jitter(height = 0, width = 0.1) +
    theme_classic(base_size = 14) +
    scale_colour_manual(values = c_sample_col) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

p1 <- qc_violin_filtered(df, "totalCount")
p2 <- qc_violin_filtered(df, "PctTotalCountMt")
p3 <- qc_violin_filtered(df, "totalFeatures")
multiplot(p1,p2,p3,cols = 2)
# Checkpoint the workspace after QC filtering.
save.image()
The prediction method used here is described by Scialdone et al. (2015) to classify cells into cell cycle phases based on the gene expression data. Using a training dataset, the sign of the difference in expression between two genes was computed for each pair of genes. Pairs with changes in the sign across cell cycle phases were chosen as markers. Cells in a test dataset can then be classified into the appropriate phase, based on whether the observed sign for each marker pair is consistent with one phase or another. We do the cell cycle classification before gene filtering as this provides more precise cell cycle phase classifications. This approach is implemented in the Cyclone function using a pre-trained set of marker pairs for human data. Some additional work is necessary to match the gene symbols in the data to the Ensembl annotation in the pre-trained marker set.
cdScFiltAnnot <- cdScFilt
__edits required human or mouse__
# This dataset is HUMAN (ovarian cancer patients; hsapiens_gene_ensembl is
# used for the mitochondrial annotation above and org.Hs.eg.db is loaded).
# The original had the MOUSE lines active (and org.Mm.eg.db was not even
# attached) — the human marker set is now the active one.
#human
sample.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
# AnnotationDbi::select is qualified explicitly because dplyr::select
# (attached via tidyverse) can mask it depending on load order.
anno <- AnnotationDbi::select(org.Hs.eg.db, keys=as.character(rownames(cdScFiltAnnot)), keytype="SYMBOL", column="ENSEMBL")
#mouse
#sample.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
#anno <- AnnotationDbi::select(org.Mm.eg.db, keys=as.character(rownames(cdScFiltAnnot)), keytype="SYMBOL", column="ENSEMBL")
# Per-row Ensembl IDs aligned to rownames(cdScFiltAnnot) (NA where unmapped).
# NOTE(review): this reuses the name `ensembl`, clobbering the biomaRt mart
# object created earlier — harmless here, but rename if the mart is needed again.
ensembl <- anno$ENSEMBL[match(as.character(rownames(cdScFiltAnnot)), anno$SYMBOL)]
# Classify each cell into G1/S/G2M using scran's pre-trained marker pairs.
assignments <- cyclone(cdScFiltAnnot, sample.pairs, gene.names=ensembl)
# G1 score vs G2M score per cell, coloured by sample. The black segments
# partition the unit square into the three phase-call regions used by cyclone
# (G1: high G1 score; G2M: high G2M score; S: both low).
df <- data.frame(x=assignments$score$G1, y=assignments$score$G2M, Sample=colData(cdScFilt)$Sample)
p<-ggplot(data=df, aes(x=x,y=y,color=Sample)) +
geom_point(size=0.5)+
xlab("G1 score")+
ylab("G2M score")+
ylim(0,1)+
xlim(0,1)+
ggtitle("Cell-cycle effects")+
theme_light(base_size=15)+
theme(axis.title.x = element_text(size=10, vjust=-2),
axis.text.x = element_text( size=10),
axis.title.y = element_text( size=10,vjust=2),
axis.text.y = element_text( size=10)) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))+
theme(legend.text=element_text(size=10),#size of legend
legend.title=element_text(size=10),
plot.title = element_text(size=20, face="bold")) +
scale_colour_manual(values=c_sample_col) +
#scale_color_discrete(name=legend.title)+
# Phase-boundary segments and region labels (S bottom-left, G1 right, G2M top).
geom_segment(aes(x = 1/2, y = 0, xend=1/2, yend=1/2),colour="black") +
geom_segment(aes(x = 0, y = 1/2, xend=1/2, yend=1/2),colour="black") +
geom_segment(aes(x = 1/2, y = 1/2, xend=1, yend=1),colour="black") +
annotate("text", x=0.05, y=0.05, label="S", size=8)+
annotate("text", x=0.95, y=0.25, label="G1", size=8)+
annotate("text", x=0.25, y=0.95, label="G2M", size=8)
print(p)
# Density-shaded version of the same scatter for overplotted regions.
smoothScatter(assignments$score$G1, assignments$score$G2M, xlab="G1 score", ylab="G2/M score", pch=16, cex=0.6)
Background Cell Cycle Analysis
The cell-cycle status of cells can be a significant confounding factor in some datasets i.e. clusters forming on the basis of cell cycle status instead of other biological factors of interest. The goal at this stage is only to assess the cell cycle status of cells not to try normalise it away.
This method would be less accurate for data that are substantially different from those used in the training set, e.g., due to the use of a different protocol. This dataset uses UMI counts, which has an entirely different set of biases, e.g., 3’-end coverage only, no length bias, no amplification noise. These new biases (and the absence of expected biases) may interfere with accurate classification of some cells. So there is some uncertainty with this analysis.
Nevertheless we need to keep in mind that there could be quite high cell-cycle effect which might confound the dataset. To avoid problems from misclassification, no processing of this dataset by cell cycle phase will be done here.
This Dataset's Cell Cycle result It looks like majority of the cells have a high G1 score indicating that the cells are somewhat going through cell-cycle stages. Generally if they are on G1 stage, then they are not in cell-cycle stage and if on G2/M then they are cell-cycle stages. But here they are in S, which is the synthesis phase indicating DNA is being replicated. __edits required to this cell__
## Assigning cell-cycle stages to the scater object
# Store cyclone's per-cell phase call (G1/S/G2M) on both objects so it is
# available for later plots regardless of which object is used.
colData(cdScFilt)$CellCycle <- assignments$phases
colData(cdScFiltAnnot)$CellCycle <- assignments$phases
cdScFilt
# Checkpoint the workspace including the cell-cycle annotation.
save.image('data.RData')
The goal is to keep genes that will be good features to discriminate the cells.
Low-abundance genes are problematic as zero or near-zero counts do not contain enough information for reliable statistical inference (Bourgon et al., 2010). In addition, the discreteness of the counts may interfere with downstream statistical procedures, e.g., by compromising the accuracy of continuous approximations.
The more low frequency, low expression genes the more noise.
These genes are likely to be dominated by drop-out events (Brennecke et al., 2013), which limits their usefulness in later analyses. Removal of these genes improves discreteness and reduces the amount of computational work without major loss of information.
Generally in scRNA-seq low-abundance genes are defined as those with an average count below a filter threshold of 1. But 10X Chromium is based on UMI counts, which are lower but better quality counts, so setting the threshold to 1 would filter a large number of cells unfairly.
In the analysis below we go through an iterative process using the figure below, starting with 0.01 to assess what cutoff to use. To check whether the chosen threshold is suitable, we examine the distribution of log-means across all genes (Figure below). Generally for higher number of cells there is a peak on the right hand side that represents the bulk of moderately expressed genes while in the middle there is a rectangular component that corresponds to lowly expressed genes. The filter threshold should cut the distribution at some point along the rectangular component to remove the majority of low-abundance genes. As shown by the blue line in the figure below, it cuts the counts at the rectangular component. Generally 9,000 - 14,000 genes is good (mouse) and 7,500-14,000 for human.
# gene counts for both cut offs
# Per-gene mean count across the filtered cells.
ave.counts <- rowMeans(counts(cdScFilt))
keepA <- ave.counts >= 0.01
keepB <- ave.counts >= 0.02
# NOTE: the thresholds apply to the RAW average count; log10 is only used for
# the histogram axis below. (The original messages said "Log10 av count",
# which was misleading.)
print(paste0('Average count >=0.01 leaves ',as.character(sum(keepA)), ' genes '))
print(paste0('Average count >=0.02 leaves ',as.character(sum(keepB)), ' genes '))
# plot of both cut offs on the log10 average-count distribution
hist(log10(ave.counts), breaks=100, main="", col="grey80",
xlab=expression(Log[10]~"average count"))
abline(v=log10(0.02), col="blue", lwd=2, lty=2)
text(log10(0.035),1000, "0.02")
abline(v=log10(0.01), col="blue", lwd=2, lty=2)
text(log10(0.006), 1000, "0.01")
__Decide cutoff__
# Chosen cutoff: keep genes with a raw average count of at least 0.01.
ave.counts <- rowMeans(counts(cdScFilt))
keep <- ave.counts >= 0.01
# Message corrected: the cutoff is on the raw average count, not its log10.
print(paste0('Average count >= 0.01 leaves ',as.character(sum(keep)), ' genes '))
#sum(keep)
plotHighestExprs(cdScFiltAnnot)
Check identities of the most highly expressed genes before filtering them. This should generally be dominated by constitutively expressed transcripts, such as those for ribosomal or mitochondrial proteins. The presence of other classes of features may be cause for concern if they are not consistent with expected biology. For example, the absence of ribosomal proteins and/or the presence of their pseudogenes are indicative of suboptimal alignment.
Plot shows percentage of total counts (per cell) assigned to the top 50 most highly-abundant features in the dataset.
For each feature, each bar represents the percentage assigned to that feature for a cell, while the circle represents the average across all cells. Bars are coloured by the total number of expressed features in each cell, while circles are coloured according to whether the feature is labelled as a control feature.
__Do the topmost genes appear to agree with the biology?__
__NOTE permanent change to cdScFiltAnnot next!__
This mean-based filter tends to be less aggressive. A gene will be retained as long as it has sufficient expression in any subset of cells. Genes expressed in fewer cells require higher levels of expression in those cells to be retained, but this is not undesirable as it avoids selecting uninformative genes (with low expression in few cells) that contribute little to downstream analyses, e.g., HVG detection or clustering. In contrast, the “at least n” filter depends heavily on the choice of n. With n = 10, a gene expressed in a subset of 9 cells would be filtered out, regardless of the level of expression in those cells. This may result in the failure to detect rare subpopulations that are present at frequencies below n. While the mean-based filter will retain more outlier-driven genes, this can be handled by choosing methods that are robust to outliers in the downstream analyses. Thus, we apply the mean-based filter to the data by subsetting the SCESet object as shown below.
# Permanently subset both objects to the genes passing the mean-based filter.
cdScFilt <- cdScFilt[keep, ]
cdScFiltAnnot <- cdScFilt
cdScFiltAnnot
# QC scatter: library size against detected features, coloured by sample.
plotColData(cdScFilt,
            x = "log10_total_counts",
            y = "total_features_by_counts",
            colour_by = "Sample") +
  scale_fill_manual(values = c_sample_col)
# Expression frequency against mean expression after filtering.
plotExprsFreqVsMean(cdScFiltAnnot)
Frequency of the cells versus mean expression. Preferably at least 1000 genes will be expressed in 50% of the cells. These uniformly expressed genes are not helpful in discriminating cells, but if the number of genes expressed in >50% is low, that means the dataset quality is not great because there are too many dropouts.
save.image('data.RData')
Single cell RNA-seq data requires different normalisation to bulk data methods (e.g. DESeq2) because scRNA-seq is very sparse. Here the deconvolution based method will be used.
Further detail on the deconvolution method to deal with zero counts: Read counts are subject to differences in capture efficiency and sequencing depth between cells (Stegle et al., 2015). Normalization is required to eliminate these cell-specific biases prior to downstream quantitative analyses. In bulk data this is often done by assuming that most genes are not differentially expressed (DE) between cells. Any systematic difference in count size across the non-DE majority of genes between two cells is assumed to represent bias and is removed by scaling. More specifically, “size factors” are calculated that represent the extent to which counts should be scaled in each library. Single-cell data can be problematic due to the dominance of low and zero counts. To overcome this, counts from many cells are pooled to increase the count size for accurate size factor estimation (Lun et al., 2016). Pool-based size factors are then “deconvolved” into cell-based factors for cell-specific normalization.
# Deconvolution (pool-based) size factors: cells are pooled at the given sizes,
# pool factors estimated, then deconvolved back to per-cell factors.
cdScFiltAnnot <- computeSumFactors(cdScFiltAnnot, sizes=c(10, 50, 100, 150))
# All size factors should be positive; if negative factors are estimated, see
# the note below about adjusting the pool sizes.
summary(sizeFactors(cdScFiltAnnot))
__If pool size is too big then in some datasets an error will be called ("estimated negative size factor"). In that case go down to smaller pool sizes (e.g. 100, 150, 200, 250, 300). If still not low enough keep going reducing the size (50,100,150,200,300,400). But this can also give "estimated negative size factor". Adjustments to these step sizes can be tried if the tSNE plots indicate that library size is dominating cluster selection.__
# Relate deconvolution size factors to library size: a tight linear trend means
# depth/capture efficiency dominates the between-cell differences.
DF <- data.frame(VAR1=sizeFactors(cdScFiltAnnot), VAR2=cdScFiltAnnot$total_counts/1e6)
model <- lm(VAR2 ~ VAR1, DF)
summary(model)
# Annotate with the actual fitted R^2 (the original hard-coded "R^2=0.90",
# which silently goes stale when the data change).
r2.label <- paste0("R^2=", round(summary(model)$r.squared, 2))
sp <- ggplot(DF, aes(x=VAR1, y=VAR2))
sp + geom_point() + stat_smooth(method=lm) + theme_light(base_size=15) +
theme(strip.background = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5)) +
xlab("Size Factor") +
ylab("Library Size (millions)")+
# NOTE(review): annotation position is still hard-coded — adjust per dataset.
annotate("text", label=r2.label, x=2.8, y=0.025)
If the size factors are tightly correlated with the library sizes for all cells, this would suggest that the systematic differences between cells are primarily driven by differences in capture efficiency or sequencing depth. Any DE between cells would yield a non-linear trend between the total count and size factor, and/or increased scatter around the trend.
With a high R^2 value, the size factors are very tightly correlated with the library size. I calculate the size factor normalization and put it in a slot of the SummarizedExperiment object.
Applying the size factors to normalize gene expression:
The count data are used to compute normalized log-expression values for use in downstream analyses. Each value is defined as the log-ratio of each count to the size factor for the corresponding cell, after adding a prior count of 1 to avoid undefined values at zero counts. Division by the size factor ensures that any cell-specific biases are removed. If spike-in-specific size factors are present in sce, they will be automatically applied to normalize the spike-in transcripts separately from the endogenous genes.
Recalculate the counts-per-million for these cells. This is because a large number of genes have been filtered out, which would have impacted the library size when CPM was calculated with unfiltered data. Now, as the number of genes has been reduced, so is the library size.
exprs(cdScFiltAnnot) <- log2(calculateCPM(cdScFiltAnnot, use_size_factors = TRUE) + 1)
Setting the parameter use.size.factors = TRUE enables the use of step sizes calculated in the above pool based method.
The log-transformation provides some measure of variance stabilization (Law et al., 2014), so that high-abundance genes with large variances do not dominate downstream analyses. The computed values are stored as an exprs matrix in addition to the other assay elements.
Check whether there are technical factors that contribute substantially to the heterogeneity of gene expression. If so, the factor may need to be regressed out to ensure that it does not inflate the variances or introduce spurious correlations. For a dataset with a simple experimental design there are no plate or batch effects to examine. However, there could be other technical factors like the cell-cycle effect or dropouts. The plots below show before and after normalisation.
cdScFiltAnnot
names(colData(cdScFiltAnnot))
#before normalisation
plotExplanatoryVariables(cdScFiltAnnot, exprs_values = "counts", variables=c("total_features_by_counts", "total_counts", "CellCycle","pct_counts_Mt", "Sample"))
#after normalisation
plotExplanatoryVariables(cdScFiltAnnot, variables=c("total_features_by_counts", "total_counts", "CellCycle", "pct_counts_Mt", "Sample"))
Assessing this plot. Typically the number of total features (genes) and total counts explain the major variability in scRNA-seq datasets. This indicates that cells vary significantly based on number of genes that they are expressing.
Number of total features explaining majority of the variability is a common issue in single-cell RNA-seq. This could be due to the sparsity of the data. The PCA plot might be separating cells just because they have very different number of features expressed in total.
However, this would not confound the t-SNE plot, which takes into account the non-linear relationships between the variables. As long as the peaks for `total_counts` and `total_features_by_counts` are below 10 this is normal.
The cell-cycle plot shown earlier and this plot together help establish the importance of the cell-cycle effect.
The goal is to identify genes that will be good features to discriminate the cells. In this step, the dataset is filtered to keep only genes that are “informative” of the variability in the data. Highly Variable Genes (HVG) are the ones driving heterogeneity across the population of cells.
Detail HVG identification requires estimation of the variance in expression for each gene, followed by decomposition of the variance into biological and technical components. HVGs are then identified as those genes with the largest biological components. This avoids prioritizing genes that are highly variable due to technical factors, such as sampling noise during RNA capture and library preparation.
In the recent implementation of seurat, Rahul Satija took a slightly different approach for HVG calculation. They calculate the variance and mean for each gene in the dataset (storing this in object@hvg.info), and sorts genes by their variance/mean ratio (VMR). They have observed that for large-cell datasets with unique molecular identifiers, selecting highly variable genes (HVG) simply based on VMR is an efficient and robust strategy.
But in the implementation used here a trend is fitted to the variance estimates of the endogenous genes, using the use.spikes=FALSE setting as shown below. This assumes that the majority of genes are not variably expressed, such that the technical component dominates the total variance for those genes. The fitted value of the trend is then used as an estimate of the technical component.
span: Low-abundance genes with mean log-expression below min.mean are not used in trend fitting, to preserve the sensitivity of span-based smoothers at moderate-to-high abundances. It also protects against discreteness, which can interfere with estimation of the variability of the variance estimates and accurate scaling of the trend. The default threshold is chosen based on the point at which discreteness is observed in variance estimates from Poisson-distributed counts. For heterogeneous droplet data, a lower threshold of 0.001-0.01 should be used.
# Fit a mean-variance trend to the endogenous genes (no spike-ins used);
# min.mean=1.0 excludes low-abundance genes from the loess fit (see note above).
var.fit <- trendVar(cdScFiltAnnot, method="loess", use.spikes=FALSE, min.mean=1.0)
#or scater default
#var.fit <- trendVar(cdScFiltAnnot, method="loess", use.spikes=FALSE)#default
#var.fit <- trendVar(cdScFiltAnnot, method="loess", use.spikes=FALSE, min.mean=0.1)
# Split each gene's total variance into technical (trend) and biological parts.
var.out <- decomposeVar(cdScFiltAnnot, var.fit)
var.out
We assess the suitability of the trend fitted to the endogenous variances by examining whether it is consistent with the variances. The trend passes through or close to most of the endogenous gene variances, indicating that our assumption (that most genes have low levels of biological variability) is valid. This strategy exploits the large number of endogenous genes to obtain a stable trend.
# Scatter of per-gene variance against mean, with the fitted technical trend.
plot(var.out$mean, var.out$total,
     pch = 16, cex = 0.6,
     xlab = "Mean log-expression", ylab = "Variance of log-expression")
ord <- order(var.out$mean)
lines(var.out$mean[ord], var.out$tech[ord], col = "dodgerblue", lwd = 2)
Ideally the plot above would look like the mountain, where it would have rise in the middle but then drop off at the end (low variance for highly expressed genes). The trend line would follow it as well.
HVG are defined as genes with biological components that are significantly greater than zero at a false discovery rate (FDR) of 5%. These genes are interesting as they drive differences in the expression profiles between cells, and should be prioritized for further investigation. Here a gene is considered to be a HVG if it has a biological component greater than or equal to 0.5. For transformed expression values on the log2 scale, this means that the average difference in true expression between any two cells will be at least 2-fold. (This reasoning assumes that the true log-expression values are Normally distributed with variance of 0.5. The root-mean-square of the difference between two values is treated as the average log2-fold change between cells and is equal to unity.) The results are ranked by the biological component to focus on genes with larger biological variability. HVG can number anywhere between 200-5000 depending on the complexity of the dataset and depth of sequencing https://www.embopress.org/doi/10.15252/msb.20188746
# HVGs: biological component >= 0.5 at FDR <= 5%, ranked by biological variance.
hvg.idx <- which(var.out$FDR <= 0.05 & var.out$bio >= 0.5)
hvg.out <- var.out[hvg.idx, ]
hvg.out <- hvg.out[order(hvg.out$bio, decreasing = TRUE), ]
print(paste0('Number of HVG using this setting is ', as.character(nrow(hvg.out))))
head(rownames(hvg.out), 20)
# Replot the mean-variance relationship, highlighting the selected HVGs in red.
plot(var.out$mean, var.out$total, pch = 16, cex = 0.6,
     xlab = "Mean log-expression",
     ylab = "Variance of log-expression")
ord <- order(var.out$mean)
lines(var.out$mean[ord], var.out$tech[ord], col = "dodgerblue", lwd = 2)
hvg.stats <- var.out[rownames(hvg.out), ]
points(hvg.stats$mean, hvg.stats$total, col = "red", pch = 16, cex = 0.6)
Plot showing the HVG in red.
## Plotting the top 15 HVGs
# Per-cell expression distributions for the 15 top-ranked HVGs.
plotExpression(cdScFiltAnnot, rownames(hvg.out)[1:15])
# Export the full HVG table (tab-separated; .xls extension so it opens in Excel).
write.table(hvg.out, file = "HVG.xls", sep = "\t", quote = FALSE, col.names = NA)
head(hvg.out, 20)
Final Notes There are alternative approaches for determining the HVG, especially those based on Coefficient of Variance. The method used here, the variance of the log-expression values, avoids genes with strong expression in only one or two cells. This ensures that the set of top HVGs is not dominated by genes with (mostly uninteresting) outlier expression patterns.
However, it has been mentioned that fitting the trendline to endogenous genes might not always be a good idea.
save.image('data.RData')
1. PCA
2. t-SNE
3. UMAP
# PCA on the log-expression of HVGs only (cells as rows after transposition).
rowVarsSorted <- exprs(cdScFiltAnnot)[rownames(hvg.out), ]
FinalPCAData <- t(rowVarsSorted)
pcaPRComp <- prcomp(FinalPCAData)
# Summarise variance per PC, overall and within the first nmax (at most 10) PCs.
nmax <- min(10, dim(pcaPRComp$x)[1])
txt1 <- paste0("Percent_PC_Var_onfirst", nmax, "PCs")
pca_var <- pcaPRComp$sdev^2
pca_var_percent <- 100 * pca_var / sum(pca_var)
pca_var_percent_first10 <- NA * pca_var
pca_var_percent_first10[1:nmax] <- 100 * pca_var[1:nmax] / sum(pca_var[1:nmax])
#pca_corr_reads <- apply(pcaPRComp$x,2,function(x) cor(x,report_sub$Assigned))
pca_var_out <- data.frame(round(pca_var, 3),
                          round(pca_var_percent, 1),
                          round(pca_var_percent_first10, 1))
#rownames(pca_var_out) <- rownames(pcaPRComp$x)
colnames(pca_var_out) <- c("PC_Var", "PC_Var_percent", txt1)
# Pairs plot of the first five PCs, coloured by sample of origin.
nColToDisplay <- 5
df <- as.data.frame(pcaPRComp$x)
df$Cell <- as.factor(colData(cdScFiltAnnot)$Sample)
# Label each panel axis with its PC and percentage of variance explained.
pcLabels <- as.character(paste0(colnames(df[, 1:nColToDisplay]), ' : ',
                                pca_var_out$PC_Var_percent[1:nColToDisplay], '% variance'))
p <- ggpairs(df,
             columns = 1:nColToDisplay,
             upper = list(continuous = "points"),
             title = 'Plotting first four PCAs',
             mapping = aes_string(color = "Cell"),
             legend = c(1, nColToDisplay),
             columnLabels = pcLabels) +
  theme_light(base_size = 15) +
  theme(plot.title = element_text(hjust = 0.5))
# Apply the shared sample palette to every panel of the plot matrix.
for (i in seq_len(p$nrow)) {
  for (j in seq_len(p$ncol)) {
    p[i, j] <- p[i, j] +
      scale_colour_manual(values = c_sample_col) +
      scale_fill_manual(values = alpha(c(c_sample_col), 0.5))
  }
}
print(p)
#PC 1v2
df_out <- as.data.frame(pcaPRComp$x)
# Derive a group label from the cell barcode prefix. Index df_out's own rownames:
# the original indexed the unrelated `df` object, which only worked because the
# two data frames happen to share rownames.
df_out$group <- sapply(strsplit(as.character(row.names(df_out)), "_"), "[[", 1)
Samples <- as.factor(colData(cdScFiltAnnot)$Sample)
pp12 <- ggplot(df_out, aes(x = PC1, y = PC2, color = Samples))
pp12 <- pp12 + geom_point() +
  scale_colour_manual(values = c_sample_col) +
  xlab(paste0("PC1: ", round(pca_var_percent_first10[1], 1), "% variance")) +
  ylab(paste0("PC2: ", round(pca_var_percent_first10[2], 1), "% variance"))
#pca_var_percent_first10[1] <- 100 * pca_var[1:nmax] / sum(pca_var[1:nmax])
#PC 3v4
df_out <- as.data.frame(pcaPRComp$x)
# Use df_out's own rownames (the original referenced the unrelated `df` object).
df_out$group <- sapply(strsplit(as.character(row.names(df_out)), "_"), "[[", 1)
Samples <- as.factor(colData(cdScFiltAnnot)$Sample)
pp34 <- ggplot(df_out, aes(x = PC3, y = PC4, color = Samples))
pp34 <- pp34 + geom_point() +
  scale_colour_manual(values = c_sample_col) +
  xlab(paste0("PC3: ", round(pca_var_percent_first10[3], 1), "% variance")) +
  ylab(paste0("PC4: ", round(pca_var_percent_first10[4], 1), "% variance"))
# Stack the PC1/2 and PC3/4 panels vertically.
multiplot(pp12, pp34, cols = 1)
__Note t-SNE calculated next. Everytime you run this step the t-SNE plot will look different.__
# t-SNE on the first 14 PCs. Stochastic: the layout differs between runs
# unless a seed is set beforehand (see the note above).
tsne_out <- Rtsne(as.matrix(pcaPRComp$x[, 1:14]), check_duplicates = FALSE, pca = FALSE,
                  perplexity = 30, theta = 0.01, dims = 2, num_threads = 4)
Rep <- as.factor(colData(cdScFiltAnnot)$Sample)
# (Removed an unused `counts` variable that shadowed the counts() accessor.)
p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x = V1, y = V2, color = Rep)) +
  geom_point(size = 0.75) +
  guides(colour = guide_legend(override.aes = list(size = 0.8))) +
  xlab("") + ylab("") +
  ggtitle("t-SNE 2D coloured by sample") +
  theme_classic(base_size = 10) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank()) +
  scale_fill_manual(values = c_sample_col) +
  scale_colour_manual(values = c_sample_col)
p2
# UMAP on the same 14 PCs used for t-SNE.
embedding <- umap(pcaPRComp$x[, 1:14])
# as_tibble() replaces the deprecated as.tibble().
as_tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color = Samples)) +
  geom_point(size = 0.75) +
  theme_classic() +
  scale_colour_manual(values = c_sample_col)
# Checkpoint after dimensionality reduction.
save.image('data.RData')
The goal is to split the cells in the dataset into clusters, such that:
There are numerous methods of clustering; dynamic-cut-tree, k-means, k-medoids, GMM, GP-LVM and so on. Here the dynamic-cut-tree will be used.
Hierarchical clustering is a widely used method for detecting clusters in genomic data. Clusters are defined by cutting branches off the dendrogram. A common but inflexible method uses a constant height cutoff value; this method exhibits suboptimal performance on complicated dendrograms. The Dynamic Tree Cut R library implements novel dynamic branch cutting methods for detecting clusters in a dendrogram depending on their shape. Compared to the constant height cutoff method, these techniques offer the following advantages: (1) they are capable of identifying nested clusters; (2) they are flexible --- cluster shape parameters can be tuned to suit the application at hand; (3) they are suitable for automation; and (4) they can optionally combine the advantages of hierarchical clustering and partitioning around medoids, giving better detection of outliers.
Hierarchical clustering is performed on the Euclidean distances between cells, using Ward’s criterion to minimize the total variance within each cluster. This yields a dendrogram that groups together cells with similar expression patterns across the chosen genes.
Clusters are explicitly defined by applying a dynamic tree cut (Langfelder et al., 2008)[https://academic.oup.com/bioinformatics/article/24/5/719/200751] to the dendrogram. This exploits the shape of the branches in the dendrogram to refine the cluster definitions, and is more appropriate than cutree for complex dendrograms. Greater control of the empirical clusters can be obtained by manually specifying cutHeight in cutreeDynamic.
An alternative approach is to cluster on a matrix of distances derived from correlations (e.g., as in quickCluster). This is more robust to noise and normalization errors, but is also less sensitive to subtle changes in the expression profiles.
# Hierarchical clustering (Ward's criterion on Euclidean distances) over the
# HVG log-expression matrix, then a dynamic tree cut to define clusters.
chosen.exprs <- logcounts(cdScFiltAnnot[rownames(hvg.out), ])
my.dist <- dist(t(chosen.exprs))
my.tree <- hclust(my.dist, method = "ward.D2")
my.clusters <- unname(cutreeDynamic(my.tree, distM = as.matrix(my.dist), verbose = 0))
print('Number of clusters')
levels(as.factor(my.clusters))
# Persist the cluster labels on the SCE for downstream use.
cdScFiltAnnot$Clusters <- as.factor(my.clusters)
# Mean-centre each gene, then draw a heatmap with cluster annotation on columns.
heat.vals <- chosen.exprs - rowMeans(chosen.exprs)
#clust.col <- rainbow(max(my.clusters))
#heatmap.2(heat.vals, col=bluered, symbreak=TRUE, trace='none', cexRow=0.3,
#          ColSideColors=clust.col[my.clusters], Colv=as.dendrogram(my.tree))
df <- data.frame(Class = as.factor(my.clusters))
ha <- HeatmapAnnotation(df = df)
#ha = HeatmapAnnotation(df = df, col = list(Condition = c("MC_A" = "dodgerblue", "MC_B"= "firebrick", "MC_C"="forestgreen", "MC_D"="gold")))
hm <- Heatmap(heat.vals, top_annotation = ha, show_column_names = FALSE,
              cluster_rows = TRUE, clustering_method_columns = "ward.D2",
              row_names_gp = gpar(fontsize = 8))
# Draw once to the notebook device, then again into a PDF.
print(hm)
pdf("heatmap.pdf", width = 20, height = 40)
print(hm)
dev.off()
## t-SNE with cluster allocation
# t-SNE embedding coloured by the dynamic-tree-cut cluster assignments,
# with all axis furniture stripped for a clean embedding view.
Clusters <- as.factor(my.clusters)
p1 <- ggplot(as.data.frame(tsne_out$Y), aes(x = V1, y = V2, color = Clusters)) +
  geom_point(size = 1.0) +
  guides(colour = guide_legend(override.aes = list(size = 1))) +
  xlab("") +
  ylab("") +
  ggtitle("t-SNE 2D coloured by clusters") +
  theme_classic(base_size = 10) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank()) +
  scale_color_manual(values = c_clust_col)
p1
## t-SNE with Cell-cycle
# t-SNE embedding coloured by assigned cell-cycle phase, same stripped style.
CellCycle <- as.factor(cdScFiltAnnot$CellCycle)
p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x = V1, y = V2, color = CellCycle)) +
  geom_point(size = 1.0) +
  guides(colour = guide_legend(override.aes = list(size = 1))) +
  xlab("") +
  ylab("") +
  ggtitle("t-SNE 2D coloured by cell-cycle") +
  theme_classic(base_size = 10) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank())
p2
## t-SNE with Sample
# t-SNE embedding coloured by sample of origin (default palette here).
Sample <- as.factor(cdScFiltAnnot$Sample)
p3 <- ggplot(as.data.frame(tsne_out$Y), aes(x = V1, y = V2, color = Sample)) +
  geom_point(size = 1.0) +
  guides(colour = guide_legend(override.aes = list(size = 1))) +
  xlab("") +
  ylab("") +
  ggtitle("t-SNE 2D coloured by sample") +
  theme_classic(base_size = 10) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank())
p3
## t-SNE Clustering impacted with read-counts
# Colour the t-SNE by per-cell log10 total counts, to check whether sequencing
# depth rather than biology is driving the structure.
# as_tibble() replaces the deprecated as.tibble().
p4 <- as_tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color = ReadDepth)) +
  geom_point(size = 0.75) +
  theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))
p4
# 2x2 panel: clusters, cell-cycle, sample, read depth.
multiplot(p1, p2, p3, p4, cols = 2)
## UMAP with cluster allocation
# UMAP embedding coloured by cluster.
# as_tibble() replaces the deprecated as.tibble().
Clusters <- as.factor(my.clusters)
as_tibble(embedding$layout) %>%
  mutate(Clusters = Clusters) %>%
  ggplot(aes(V1, V2, color = Clusters)) +
  geom_point(size = 0.75) +
  theme_classic() +
  scale_color_manual(values = c_clust_col)
## UMAP with sample colouring
# UMAP embedding coloured by sample. The original added geom_point() twice
# (once at size 0.75, once at alpha 0.5), over-plotting every cell; a single
# layer with both settings is the intended result. as_tibble() replaces the
# deprecated as.tibble().
as_tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color = Samples)) +
  geom_point(size = 0.75, alpha = 0.5) +
  theme_classic() +
  scale_colour_manual(values = c_sample_col)
## UMAP view clustering impacted with read counts
# UMAP embedding coloured by per-cell log10 total counts.
# as_tibble() replaces the deprecated as.tibble().
as_tibble(embedding$layout) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color = ReadDepth)) +
  geom_point(size = 0.75) +
  theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))
Note on interpreting plots Ideally clustering will be determined by biology and not read count. Likewise the t-SNE and UMAP should not be dominated by the read depth. However, this is a known problem (Hafemeister et al., 2019).
#how many cells from each sample are in each cluster?
dfTemp <- data.frame(Sample = cdScFiltAnnot$Sample, Clusters = cdScFiltAnnot$Clusters)
# Cross-tabulate once and reuse (the original called table() twice).
clusterTab <- table(dfTemp)
clusterTab
write.table(clusterTab, file = "ClusterPopulations.xls", sep = "\t", quote = FALSE, col.names = NA)
The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters. Silhouette width can be interpreted as follows:
In this section, we describe the most widely used clustering validation indices. Recall that the goal of partitioning clustering algorithms (Part @ref(partitioning-clustering)) is to split the data set into clusters of objects, such that:
Internal validation measures often reflect the compactness, the connectedness and the separation of the cluster partitions.
Generally most of the indices used for internal clustering validation combine compactness and separation measures as follow:
$Index = \frac{\alpha \times Separation}{\beta \times Compactness}$ where $\alpha$ and $\beta$ are weights.
The silhouette analysis measures how well an observation is clustered and it estimates the average distance between clusters. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters.
For each observation $i$, the silhouette width $s_i$ is calculated as follows:
Silhouette width can be interpreted as follows:
# Silhouette widths per cell, grouped by cluster assignment.
fviz_silhouette(silhouette(my.clusters, my.dist), print.summary = FALSE)
#objects required for Shiny app
# Store both embeddings on the SCE so downstream tools can fetch them
# via reducedDim().
reducedDim(cdScFiltAnnot,'tSNE') <- tsne_out$Y
reducedDim(cdScFiltAnnot,'UMAP') <- embedding$layout
cdScFiltAnnot
In the following plots we visualize the expression of XIST in individual cells. We show the same plot with different colour palettes.
# t-SNE coloured by XIST log-expression (grey -> red gradient).
# The redundant standalone `GeneExp` assignment and the bogus
# `GeneExp = GeneExp` pseudo-aesthetic in the top-level aes() were removed;
# the colour mapping on the point layer does the work.
df <- as.data.frame(tsne_out$Y)
df[, 'GeneExp'] <- logcounts(cdScFiltAnnot)['XIST', ]
p1 <- ggplot(df, aes(x = V1, y = V2)) +
  geom_point(size = 1.00, aes(colour = GeneExp), alpha = 0.8) +
  #scale_colour_viridis_c()+
  scale_colour_gradient(low = "gray88", high = "red") +
  #guides(colour = guide_legend(override.aes = list(size=4))) +
  xlab("") + ylab("") +
  ggtitle(paste0('Gene Exp:','XIST')) +
  theme_classic(base_size = 14) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank())
p1
# Same XIST plot with an RdYlBu palette. The redundant standalone `GeneExp`
# assignment and the bogus `GeneExp = GeneExp` pseudo-aesthetic were removed.
df <- as.data.frame(tsne_out$Y)
df[, 'GeneExp'] <- logcounts(cdScFiltAnnot)['XIST', ]
p1 <- ggplot(df, aes(x = V1, y = V2)) +
  geom_point(size = 1.00, aes(colour = GeneExp), alpha = 0.8) +
  #scale_colour_viridis_c()+
  scale_colour_gradientn(colours = hcl.colors(n = 4, palette = 'RdYlBu')) +
  #guides(colour = guide_legend(override.aes = list(size=4))) +
  xlab("") + ylab("") +
  ggtitle(paste0('Gene Exp:','XIST')) +
  theme_classic(base_size = 14) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank())
p1
# Same XIST plot with the viridis palette. The redundant standalone `GeneExp`
# assignment and the bogus `GeneExp = GeneExp` pseudo-aesthetic were removed.
df <- as.data.frame(tsne_out$Y)
df[, 'GeneExp'] <- logcounts(cdScFiltAnnot)['XIST', ]
p1 <- ggplot(df, aes(x = V1, y = V2)) +
  geom_point(size = 1.00, aes(colour = GeneExp), alpha = 0.8) +
  scale_colour_viridis_c() +
  #scale_colour_gradient(low = "gray88", high = "purple4")+
  #guides(colour = guide_legend(override.aes = list(size=4))) +
  xlab("") + ylab("") +
  ggtitle(paste0('Gene Exp:','XIST')) +
  theme_classic(base_size = 14) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.border = element_blank())
p1
# Persist the fully annotated SCE for downstream notebooks/apps, plus the workspace.
saveRDS(cdScFiltAnnot,'cdScFiltAnnot.rds')
save.image('data.RData')
Potential marker genes are identified by taking the top set of DE genes from each pairwise comparison between clusters. The results are arranged into a single output table that allows a marker set to be easily defined for a user-specified size of the top set. For example, to construct a marker set from the top 10 genes of each comparison, one would filter marker.set to retain rows with Top less than or equal to 10.
__note any poor clusters in the silhouette plot and consider how trustworthy the marker genes actually are.__
# Pairwise edgeR QL marker detection: for each cluster i, test it against every
# other cluster j, then rank each gene by its best FDR across comparisons.
cluster <- factor(my.clusters)
de.design <- model.matrix(~0 + cluster)
y <- convertTo(cdScFiltAnnot, type="edgeR")
y <- estimateDisp(y, de.design)
summary(y$tagwise.dispersion)
clust.col <- rainbow(max(my.clusters))
clusterNumber <- max(my.clusters)
pairwiseComparisonNumber <- choose(clusterNumber, 2)
# Comparisons per cluster, and the index of the last logFC column in marker.set.
compClusters <- length(levels(as.factor(my.clusters))) - 1
endClusters <- (3 + length(levels(as.factor(my.clusters))) - 2)
# The QL fit does not depend on the contrast, so fit once outside the loops
# (the original refitted it for every i/j pair, and also ran an unused glmFit).
fit <- edgeR::glmQLFit(y, de.design)
# Set this up if you are running the following code parallel
## setup parallel backend to use many processors
#cores=detectCores()
#cl <- makeCluster(6) #Have choosen 6 cores here
#registerDoParallel(cl)
#foreach(i=c(1:(clusterNumber))) %dopar% {
for (i in seq_len(clusterNumber)) {
  # Reset per cluster: the original accumulated results across i, so stale
  # contrasts from earlier clusters leaked into later marker tables.
  result.logFC <- result.FDR <- list()
  for (j in seq_len(clusterNumber)) {
    if (i == j) { next }
    print(paste0('In ',i,", in ",j))
    # Contrast: +1 for cluster i, -1 for cluster j.
    contrast <- numeric(ncol(de.design))
    contrast[i] <- 1
    contrast[j] <- -1
    res <- edgeR::glmQLFTest(fit, contrast = contrast)
    # n=Inf keeps every gene; the original used length(y$design), which is
    # unrelated to the number of genes.
    top.tags <- edgeR::topTags(res, n = Inf, sort.by = "none")
    con.name <- paste0('vs.', levels(cluster)[j])
    result.logFC[[con.name]] <- top.tags$table$logFC
    result.FDR[[con.name]] <- top.tags$table$FDR
  }
  # Best (minimum) FDR rank per gene across all comparisons for cluster i.
  collected.ranks <- lapply(result.FDR, rank, ties.method = "first")
  min.rank <- do.call(pmin, collected.ranks)
  marker.set <- data.frame(Top = min.rank, Gene = rownames(top.tags$table),
                           logFC = do.call(cbind, result.logFC),
                           FDR = result.FDR, stringsAsFactors = FALSE)
  marker.set <- marker.set[order(marker.set$Top), ]
  # Positive markers: up-regulated in cluster i against every other cluster.
  marker.set.pos <- marker.set[rowSums(marker.set[, 3:endClusters] > 0) == compClusters, ]
  marker.set.pos <- marker.set.pos[order(marker.set.pos$Top), ]
  write.table(marker.set, file=paste0("SC_workshop_Cluster",i,".tsv"), sep="\t", quote=FALSE, col.names=NA)
  write.table(marker.set.pos, file=paste0("SC_workshop_Cluster",i,"_pos.tsv"), sep="\t", quote=FALSE, col.names=NA)
}
Here we generate the file that can be directly imported into IPA for downstream analysis.
#Each cluster against other cells pooled for IPA
# For every cluster, test cluster cells ("1") against all remaining cells ("2")
# with nbTestSH, Bonferroni-adjust, and bind the per-cluster columns.
# Results are collected in a list and bound once at the end: the original grew
# `resAll` with cbind inside the loop and seeded it on cluster "1", which
# breaks (or drops columns) when cutreeDynamic emits a cluster "0".
resList <- list()
for (clust in levels(as.factor(my.clusters))) {
  inClust <- colData(cdScFiltAnnot)$Clusters %in% clust
  counts1 <- counts(cdScFiltAnnot)[, inClust]
  counts2 <- counts(cdScFiltAnnot)[, !inClust]
  countsTable <- cbind(counts1, counts2)
  conds.Sel <- c(rep("1", dim(counts1)[2]), rep("2", dim(counts2)[2]))
  print(table(conds.Sel))
  res1 <- nbTestSH(countsTable, conds.Sel, "1", "2")
  # Column 7 is assumed to hold the raw p-value (as in the original) —
  # NOTE(review): verify against the nbTestSH output if the package changes.
  res1[, 'FDR'] <- p.adjust(res1[, 7], method = "bonferroni")
  res1 <- res1[, c('rawLog2FoldChange', 'pval', 'FDR')]
  colnames(res1) <- paste0('Cl_', clust, '_', colnames(res1))
  resList[[clust]] <- res1
}
# unname() keeps the per-cluster column names exactly as set above.
resAll <- do.call(cbind, unname(resList))
write.table(resAll, file="IPA_each_cluster_v_other_cells.xls", sep="\t", quote=FALSE, col.names=NA)
# Display the final state of the annotated object, then a last checkpoint.
cdScFiltAnnot
save.image('data.RData')